NBIS ID: SMS-6112
Report Version: 1.0
Request by: Mattias Westman (mattias.westman@ki.se)
Principal Investigator: Carl-Johan Sundberg (carl.j.sundberg@ki.se)
Organisation: KI
NBIS Staff: Juliana Assis (juliana.assis@nbis.se)
source('https://raw.githubusercontent.com/nimarafati/RNA_Seq_Pipeline/master/RNA_Seq/DE_script.R')
## Paths
path <- '/Users/juliana/Documents/GitHub/SMS-6112-22-MB/'
## LIBRARIES
# data handling
library(dplyr)
library(tidyr)
library(stringr)
# tables
library(kableExtra) # complete table
library(formattable) # table with conditional formatting
library(DT) # interactive table
# graphics
library(ggplot2) # static graphics
# interactive graphics
library(highcharter)
library(plotly)
library(ggiraph) # convert ggplot to interactive
library(dygraphs) # time series
library(networkD3) # network graph
library(leaflet) # interactive maps
library(crosstalk) # linking plots
# analysis
library("devtools")
library("bsselectR")
library("dplyr")
library("ggplot2")
library("tidyr")
library("DECIPHER")
library("gplots")
library("gridExtra")
library("grid")
library("ggpubr")
library("reshape2")
library("reshape")
library("ggrepel")
library("ggh4x")
library("RColorBrewer")
library("tidyverse")
library("DESeq2") # Bioconductor
library('ggfortify')
library("AnnotationDbi")
library("FactoMineR")
library("factoextra")
library("PCAtools")
library("Rqc")
library("gt")
library("clusterProfiler")
library("rnaseqGene")
#library("refGenome")
library("plotly")
library("biomaRt")
library("ggridges")
library("viridis")
library("hrbrthemes")
## Functions
#Perform DE analysis
perform_DE <- function(dds, contrast_name, log2FC_cutoff, padjust_cutoff, file_name, volcano, genes, file_path){
res_obj <- results(dds, contrast = contrast_name, lfcThreshold = log2FC_cutoff, alpha = padjust_cutoff)
res <- res_obj
res$gene_id <- rownames(res_obj)
res <- inner_join(x = as.data.frame(res), y = genes, by = 'gene_id')
res <- res[!is.na(res$padj),]
res$diffexpressed <- 'No'
res$diffexpressed[(res$log2FoldChange >= log2FC_cutoff & res$padj <= 0.05)] <- 'Up-regulated'
res$diffexpressed[(res$log2FoldChange <= -log2FC_cutoff & res$padj <= 0.05)] <- 'Down-regulated'
res$delabel <- 'NA'
res[(res$diffexpressed != 'No'), 'delabel'] <- res[(res$diffexpressed != 'No'),'gene_name']
# Volcano
if(volcano == T){
png(paste0(file_path, file_name, '_volcano.png'), width = 3000, height = 2000, res = 300)
p <- ggplot(data= as.data.frame(res), aes(x = log2FoldChange, y = -log10(padj), col = diffexpressed, label = delabel)) +
geom_point() +
theme_classic() +
scale_colour_manual(values = c('blue', 'black', 'red')) +
geom_vline(xintercept = c(-log2FC_cutoff,log2FC_cutoff), col = 'red') +
geom_hline(yintercept = -log10(padjust_cutoff), col = 'red') +
ggtitle(file_name) +
theme(plot.title = element_text(hjust = 0.5))
print(p)
dev.off()
# ntd <- normTransform(dds)
# select <- res[(res$diffexpressed == 'Down-regulated'), 'gene_id']
# df <- as.data.frame(colData(dds)[,c('Cell_group', 'Abeta', 'Sex')])
# pheatmap(assay(ntd)[select,],
# cluster_rows = F,
# cluster_cols = T,
# show_rownames = F,
# annotation_col = df)
}
write.table(res, paste0(file_path, 'DE_results.txt'), col.names = T, row.names = F, sep = '\t', quote = F)
res_results <- list(DE_table = res,
Volcanot_plot = p,
res_obj = res_obj)
saveRDS(res_results, paste0(file_path, 'res_results.rds'))
return(res_results)
}
# Plot PC3, PC4
plotPCA_PCs <- function (object, ...)
{
.local <- function (object, intgroup = "condition", ntop = 500,
returnData = FALSE)
{
rv <- rowVars(assay(object))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop,
length(rv)))]
pca <- prcomp(t(assay(object)[select, ]))
percentVar <- pca$sdev^2/sum(pca$sdev^2)
if (!all(intgroup %in% names(colData(object)))) {
stop("the argument 'intgroup' should specify columns of colData(dds)")
}
intgroup.df <- as.data.frame(colData(object)[, intgroup,
drop = FALSE])
group <- if (length(intgroup) > 1) {
factor(apply(intgroup.df, 1, paste, collapse = ":"))
}
else {
colData(object)[[intgroup]]
}
d <- data.frame(PC1 = pca$x[, PC1], PC2 = pca$x[, PC2], group = group,
intgroup.df, name = colnames(object))
if (returnData) {
attr(d, "percentVar") <- percentVar[1:length(percentVar)]
return(d)
}
geom_point(size = 3) + xlab(paste0(PC1, ": ", round(percentVar[PC1] *
100), "% variance")) + ylab(paste0(PC2, ": ", round(percentVar[PC2] *
100), "% variance")) + coord_fixed()
}
.local(object, ...)
}
ridgeline_plot <- function(ego_result, plot_name, fc_sorted){
t <- ego_result
t <- t %>% mutate(geneID = strsplit(as.character(geneID), '/')) %>% unnest(geneID) %>% filter(geneID != '') %>% dplyr::select(geneID, c('ONTOLOGY', 'Description', 'geneID'))
t <- data.frame(t, logFC = fc_sorted[t$geneID])
for(ont in unique(t$ONTOLOGY)){
png(paste0(plot_name, '_', ont, '.png'), res = 200, height = 2000, width = 2000)
p <- ggplot(t[(t$ONTOLOGY == ont),], aes(x = logFC, y = Description, fill = ..x..)) +
geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
scale_fill_viridis(name = "logFC", option = "C") +
labs(title = paste0('logFC distribution of genes in enriched GO terms (', ont, ')')) +
theme_ipsum() +
theme(
legend.position="none",
panel.spacing = unit(0.1, "lines"),
strip.text.x = element_text(size = 8)
)
print(p)
dev.off()
}
}
## Enrichment analysis
### GO
run_ego <- function(DEG_entrez_id, OrgDb, ont, all_entrez_id, min_gene_Size, max_gene_Size, path_to_write) {
setwd(path_to_write)
ego <- clusterProfiler::enrichGO(DEG$entrezgene_id,
OrgDb = org_db,
ont = ontology_group,
universe = all_entrez_id,
minGSSize = min_gene_Size,
maxGSSize = max_gene_Size)
if(nrow(ego) > 0 && !is.null(ego)){
write.table(ego , 'enrichGO_results.txt', col.names = T, row.names = F, quote = F, sep = '\t')
p_ego<- clusterProfiler::dotplot(ego, split="ONTOLOGY", font.size = 8, showCategory = nrow(ego), title = 'GO enrichment analysis') + facet_grid(ONTOLOGY~., scale="free") + theme(plot.title = element_text(hjust = 0.5))
## ridgeline plot
ridgeline_plot(ego_result = ego@result, plot_name = 'ridgeline_FDR_0.05', fc_sorted = fc_sorted)
}else{
write.table('No GO enrichment result' , 'enrichGO_results.txt', col.names = T, row.names = F, quote = F, sep = '\t')
p_ego <- ggplot() + theme_void() + ggtitle('No significant term/category/pathway')
}
print(p_ego)
dev.off()
return(ego)
}
#### GSEGO
run_gsego <- function (fc_sorted, OrgDb, ontology_group, keyType, min_gene_Size, max_gene_Size, pvalueCutoff, path_to_write){
setwd(path_to_write)
gsego <- clusterProfiler::gseGO(geneList=fc_sorted,
ont = ontology_group,
keyType = keyType,
minGSSize = min_gene_Size,
maxGSSize = max_gene_Size,
pvalueCutoff = pvalueCutoff,
pAdjustMethod = 'BH',
verbose = TRUE,
OrgDb = org_db)
if(!is.null(nrow(gsego)) && nrow(gsego) > 0 ){
write.table(gsego , 'GSEA_GO_FDR_0.05.txt', col.names = T, row.names = F, quote = F, sep = '\t')
## dotplot
png('dotplot_GSEA_GO_FDR_0.05.png', res = 200, height = 2000, width = 2000)
p_gsego_dotplot <- clusterProfiler::dotplot(gsego, showCategory=nrow(gsego), split=".sign", font.size = 8, title = 'Gene set enrichment analysis of GO terms' ) + facet_grid(.~.sign) + theme(plot.title = element_text(hjust = 0.5))
print(p_gsego_dotplot)
dev.off()
## Gene set enrichment plot for each significant term
cntr <- 1
for(go_id in as.character(gsego@result$ID)){
cat('\r', paste0('visualising ',go_id,' GO'))
Description <- gsub( x = gsub(pattern = '/', '_', perl = T,gsego$Description[cntr]), replacement = "_", pattern = ' ')
png(paste0('GSEA_GO_FDR_0.05_', Description, '.png'), res = 200, height = 2000, width = 2000)
p_gsego_gse <- clusterProfiler::gseaplot(gsego, by = ontology_group, title = gsego$Description[cntr], geneSetID = cntr)
print(p_gsego_gse)
dev.off()
cntr <- cntr + 1
}
}else{
write.table('No GSEA GO result', 'GSEA_GO_FDR_0.05.txt', col.names = F, row.names = F, quote = F, sep = '\t')
png('dotplot_GSEA_GO_FDR_0.05.png', res = 200, height = 2000, width = 2000)
p_gsego_dotplot <- ggplot() + theme_void() + ggtitle('No significant term/category/pathway')
print(p_gsego_dotplot)
dev.off()
}
return(gsego)
}
### KEGG
run_ekegg <- function (fc_sorted, organism, keyType, all_entrez_id, minGSSize, pvalueCutoff, path_to_write){
setwd(path_to_write)
ekegg <- clusterProfiler::enrichKEGG(names(fc_sorted), organism = organism, keyType = keyType, pvalueCutoff = pvalueCutoff, pAdjustMethod = "BH", universe = as.character(all_entrez_id), maxGSSize = 1000, minGSSize = 10)
ekegg@result$ratio <- sapply(strsplit(ekegg@result$GeneRatio, "/"), function(x) as.numeric(x[1])/as.numeric(x[2]))
if(!is.null(nrow(ekegg)) && nrow(ekegg) > 0 ){
write.table(ekegg , 'enrichKEGG_results.txt', col.names = T, row.names = F, quote = F, sep = '\t')
ekegg_FDR_0.05 <- ekegg
ekegg_FDR_0.05@result <- ekegg@result[(ekegg@result$p.adjust <= 0.05), ]
png('dotplot_KEGG_Enrichment_FDR_0.05.png', res = 200, height = 2000, width = 2000)
p_ekegg <- clusterProfiler::dotplot(ekegg_FDR_0.05, font.size = 8, showCategory = nrow(ekegg_FDR_0.05), title = 'KEGG enrichment analysis') + theme(plot.title = element_text(hjust = 0.5))
print(p_ekegg)
dev.off()
}else{
write.table('No KEGG enrichment result' , 'enrichKEGG_results.txt', col.names = F, row.names = F, quote = F, sep = '\t')
png('dotplot_KEGG_Enrichment_FDR_0.05.png', res = 200, height = 2000, width = 2000)
p_gsego_dotplot <- ggplot() + theme_void() + ggtitle('No significant term/category/pathway')
print(p_gsego_dotplot)
dev.off()
p_ekegg <- ggplot() + theme_void() + ggtitle('No significant term/category/pathway')
print(p_ekegg)
dev.off()
return(ekegg)
}
}
#### GSEA_KEGG
run_gsekegg <- function(fc_sorted, organism, keyType, eps = 0, minGSSize, pvalueCutoff, path_to_write){
setwd(path_to_write)
gsekegg <- clusterProfiler::gseKEGG(fc_sorted, organism = org, keyType = gene_type_conversion, pvalueCutoff = pvalueCutoff, eps = eps, minGSSize = min_gene_Size, maxGSSize = max_gene_Size, pAdjustMethod = "BH", nPermSimple = 10000)
if( !is.null(nrow(gsekegg)) && nrow(gsekegg) > 0){
gsekegg_FDR_0.05 <- gsekegg
gsekegg_FDR_0.05@result <- gsekegg@result[(gsekegg@result$p.adjust <= 0.05), ]
write.table(gsekegg_FDR_0.05 , 'GSEA_KEGG_FDR_0.05.txt', col.names = T, row.names = F, quote = F, sep = '\t')
png('dotplot_GSEA_KEGG_FDR_0.05.png', res = 200, height = 2000, width = 2000)
p_gsekegg_dotplot <- clusterProfiler::dotplot(gsekegg_FDR_0.05, showCategory = 10, title = "Enriched Pathways" , split=".sign") + facet_grid(.~.sign) + theme(plot.title = element_text(hjust = 0.5))
print(p_gsekegg_dotplot)
dev.off()
}else{
write.table('NO GSEA KEGG result' , 'GSEA_KEGG_FDR_0.05.txt', col.names = F, row.names = F, quote = F, sep = '\t')
png('dotplot_GSEA_KEGG_FDR_0.05.png', res = 200, height = 2000, width = 2000)
p_gsekegg_dotplot <- ggplot() + theme_void() + ggtitle('No significant term/category/pathway')
print(p_gsekegg_dotplot)
dev.off()
}
}
#### pathview
run_pathview <- function(gsekegg_FDR_0.05, fc_sorted, org, path_to_write){
setwd(path_to_write)
if(!is.null(nrow(gsekegg_FDR_0.05)) && nrow(gsekegg_FDR_0.05) >0 ){
cntr <- 1
for(ptw in as.character(gsekegg_FDR_0.05@result$ID)){
cat('\r', paste0('Saving ',ptw,' pathway'))
Description <- gsub( x = gsub(pattern = '/', '_', perl = T,gsekegg_FDR_0.05$Description[cntr]), replacement = "_", pattern = ' ')
pathview(gene.data=fc_sorted, pathway.id=ptw, species = org, same.layer = 2)
png(paste0('GSEA_KEGG_FDR_0.05_', Description, '.png'), res = 200, height = 2000, width = 2000)
p <- gseaplot(gsekegg_FDR_0.05, by = "all", title = gsekegg_FDR_0.05$Description[cntr], geneSetID = cntr)
print(p)
dev.off()
cntr <- cntr + 1
}
}else{
png(paste0('GSEA_KEGG_FDR_0.05.png'), res = 200, height = 2000, width = 2000)
p <- ggplot() + theme_void() + ggtitle('No significant term/category/pathway')
print(p)
dev.off()
}
}Myotis brandtii are known to have at least a ten-fold longer life expectancy compared to mice of similar body mass, and the highest relative longevity of any mammals. To this end we have collected fresh snap-frozen complete organ systems from these bats during both winter hibernation and summer active period. We thus want to compare gene expressions and control of gene expressions across organs and between these groups. The whole genome is sequenced and available, but the pipeline from sequences to count tables is not established.
The list of contrasts of interest is:
genes based on the row-wise variance (no statistical support)
-Variant between Summer vs Winter (Bat1 vs Bat2)
-Variant between Organs and S/W
> –Intestine-S & Intestine-W,
–Kidney-S & Kidney-W, –Skeletal_muscle-S & Skeletal_muscle-W
There are 12 samples where 5 belongs to Library1, 5 to Library2 and 2 to Library3
Type of data
bulk RNASeq
Data location
cd /proj/snic2022-22-672/private/SMS_6112_RNASeq_Bat_Library1_2_3/
SNIC 2022/23-355 SNIC Small Storage
SNIC 2022/22-672 SNIC Small Compute
NCBI Myotis brandtii Annotation Release 101
Partial genome with low quality annotation
nfcore-rnaseq
Trinity not shown
Reads were aligned to to the Myotis brandtii Annotation Release 101 genome by using nf-core/rnaseq pipeline (3.9) in Reproducibility section you can find all the tools with version used in nf-core pipeline. For downstream analyses we used expression values generated by.
In following table you can find list of samples and metadata.
library("GenomicFeatures")
library("ensembldb")
gtf2 <- rtracklayer::import("/Users/juliana/Documents/GitHub/SMS-6112-22-MB/data/GCF_000412655.1_ASM41265v1_genomic.gtf")
TxDb <- makeTxDbFromGFF("/Users/juliana/Documents/GitHub/SMS-6112-22-MB/data/GCF_000412655.1_ASM41265v1_genomic.gtf")
# Convert to gene names
mart <- useMart("ENSEMBL_MART_ENSEMBL")
mart <- useDataset(mart=mart,dataset="mlucifugus_gene_ensembl")
myattributes <- c("ensembl_gene_id",
"entrezgene_id",
"external_gene_name",
"chromosome_name",
"start_position",
"end_position",
"strand",
"gene_biotype",
"description")
bdata <- getBM(mart=mart,attributes=myattributes,uniqueRows=T,
useCache=FALSE)
# remove duplicated gene_ids
annotation <- bdata[!duplicated(bdata$ensembl_gene_id),c('ensembl_gene_id', 'external_gene_name', 'entrezgene_id')]
colnames(annotation) <- c('gene_id', 'Name', 'entrezgene_id')
annotation[(annotation$Name == ''), 'Name'] <- 'Uncharacterised'
write.table(annotation, paste0('/Users/juliana/Documents/GitHub/SMS-6112-22-MB/results/annotation.txt'), col.names = T, row.names = F, sep = '\t')
#ensDbFromAHHere we load normalized counts scaled for length generated by salmon (version) [@] in nf-core/rnaseq pipeline. We tested different model and dependent on the contrast we chose one of them for downstream analysis.
exp <- readRDS("/Users/juliana/Documents/GitHub/SMS-6112-22-MB/results/ALL_Libraries/R_Files/salmon.merged.gene_counts_length_scaled.rds")
exp2 <- exp
assay(exp, 1) <- round(data.matrix(assay(exp, 1)))
assay(exp, 2) <- NULL
# Add metadata
exp2$sample_id <- as.factor(samples_info$sample_id)
exp2$bat <- as.factor(samples_info$bat)
exp2$organ <- as.factor(samples_info$organ)
exp2$library <- as.factor(samples_info$library)
exp2$organ_bat <- as.factor(samples_info$organ_bat)
counts <- assay(exp2, "counts")
dds <- DESeqDataSetFromMatrix(countData = round(counts),
colData = samples_info,
design = ~ 0 + organ + bat)We filter out low-count genes before downstream analysis by using at least 20 reads in total (including all samples).
keep <- rowSums(counts(dds)) >= 20
dds <- dds[keep,]This high-dimensional count data can - after normalization - be visualized in two dimensions in a Principal Component Analysis (PCA). Using PCA, we examine the structure of the data by projecting the samples on a two-dimensional graph using the two first principal components that explain the most variation between those samples. The plot can be used to examine the samples for outliers, sample swaps and other relationships. When normalization successfully removed technical artefacts, the relative distances should be biologically interpretable.
cols_organ <- c("Skeletal muscle" = "#264D59", "Kidney" = "#77A515", "Intestine" = "#8B4513")
vsd <- vst(dds, blind = F)
plotPCA(vsd, "library")
#assay(vsd) <- limma::removeBatchEffect(assay(vsd), vsd$library)
#plotPCA(vsd, "library")
#topVarGenes_p <- head( order( rowVars( assay(vsd) ), decreasing=TRUE ), 500 )
a <- plotPCA(vsd, intgroup = c('sample_id', 'bat', 'organ' ,'library'), returnData = TRUE)
# By organ
teste <- plotPCA(vsd, "bat", "organ")
#$y
#[1] "PC2: 22% variance"
#$x
#[1] "PC1: 51% variance"
png(paste0(path, '/results/ALL_Libraries/results/01_DE/PCA_bat_organ_L1_2_3.png'), width = 4000, height = 3000, res = 300)
#teste$labels get variance
ggplot(a, aes(x = PC1, y = PC2, color = organ, shape = bat)) +
geom_point(size = 4, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = 'dashed', alpha = 0.3) +
geom_vline(xintercept = 0, linetype = 'dashed', alpha = 0.3) +
scale_shape_manual(values=c(15,17)) +
theme_pubr(border = TRUE) +
#coord_fixed(ratio = 1) +
geom_text_repel(aes(label = library), nudge_x = 0.06, size = 5.0, segment.alpha = 0.8) +
theme(axis.text=element_text(size=14),
axis.text.x = element_text(size = 12, hjust = 0.5),
axis.title.y = element_text(size = 18),
legend.text=element_text(size=14),
legend.title=element_text(size=0),
legend.position="bottom",
axis.title.x = element_text(size = 18),
strip.text.x = element_text(size = 20, face = "bold")) +
labs(x="PC1: 51% variance", y = "PC2: 22% variance", element_text(face = "bold")) +
scale_color_manual(values = cols_organ)
dev.off()#The rlog transform
#https://bioc.ism.ac.jp/packages/2.14/bioc/vignettes/DESeq2/inst/doc/beginner.pdf
#highly variable genes. These are the genes most variable across all samples regardless of which samples they are
#rowVars on the output of vst as this corrects for the biased mean/variance trend and puts data on the log scale
rld <- rlog(dds, blind=F )
# Heatmap of Euclidean sample distances after rlog transformation.
sampleDists <- dist( t( assay(rld) ) )
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( rld$sample_id, rld$organ, rld$bat , sep="-" )
png(paste0(path, '/results/ALL_Libraries/results/01_DE/HeatMap_rlog_L1_2_3.png'), width = 4000, height = 3000, res = 300)
colours = colorRampPalette(rev(brewer.pal(9, "Blues")) )(255)
d <- pheatmap(sampleDistMatrix, trace="none", col=colours)
d
dev.off()
#Gene clustering
topVarGenes <- head(order(rowVars(assay(rld)), decreasing=TRUE ), 100)
df <- as.data.frame(colData(dds)[,c("bat","organ", "library")])
df$bat = factor(df$bat, levels = c("Bat_1", "Bat_2"), labels = c("Bat_1", "Bat_2"))
metaR_bat <- c("#C0C0C0","#BC8F8F")
names(metaR_bat) <- levels(df$bat)
df$organ = factor(df$organ, levels = c("Skeletal muscle", "Kidney","Intestine"))
metaR_organ <- c("#264D59", "#77A515", "#8B4513")
names(metaR_organ) <- levels(df$organ)
metaR_AnnColour <- list(
bat = metaR_bat)
metaR_AnnColour2 <- list(
organ = metaR_organ)
metaR_AnnColourx <- list(
organ = metaR_organ,
bat = metaR_bat)
# Check the output
metaR_AnnColourx
SampleOrder = order(df$organ, df$bat)
meta.factors <- df[1:2]
df[1:2]
#png(paste0(path, '/results/ALL_Libraries/results/01_DEHeatMap_topvar500_L1.png'), width = 4000, height = 15500, res = 300)
png(paste0(path, '/results/ALL_Libraries/results/01_DE/HeatMap_topvar100_L1_2_3.png'), width = 4000, height = 8000, res = 300)
colsHeat<- c("#F7F7F7", "#92C5DE", "#0571B0", "#F4A582", "#CA0020")
pheatmap2 <- pheatmap(assay(rld)[topVarGenes, SampleOrder],
cluster_cols = FALSE,
cluster_rows = TRUE,
gaps_row = 5,
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
annotation_colors = metaR_AnnColourx, annotation_col = meta.factors,
show_colnames = TRUE,
#color = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255),
color = colorRampPalette(c(colsHeat))(255),
border_color = "#f8edeb",
display_numbers = FALSE)
pheatmap2
dev.off()
dev.off()
##
topVarGenes1000 <- head(order(rowVars(assay(rld)), decreasing=TRUE ), 1000)
mat <- assay(vsd)[ topVarGenes1000, ]
mat <- mat - rowMeans(mat)
anno <- as.data.frame(colData(vsd)[, c("bat","organ")])
png(paste0(path, '/results/ALL_Libraries/results/01_DE/HeatMap_topvar1000_clus_L1.png'), width = 5000, height = 30500, res = 300)
h_1000 <- pheatmap(mat, annotation_col = anno)
#write.table(mat,file='/Users/juliana/Documents/GitHub/SMS-6112-22-MB/results/ALL_Libraries/results/01_DE/topVarGenes1000_L1_2_3.tsv',quote=FALSE,sep="\t")
dev.off()
##
png(paste0(path, '/results/ALL_Libraries/results/01_DE/HeatMap_topvar100_clus_1_L2_3.png'), width = 5000, height = 8000, res = 300)
#png(paste0(path, '/results/ALL_Libraries/results/01_DEHeatMap_topvar500_clus_L1.png'), width = 5000, height = 15500, res = 300)
pheatmap3 <- pheatmap(assay(rld)[topVarGenes, SampleOrder],
cluster_cols = TRUE,
cluster_rows = TRUE,
gaps_row = 5,
scale="row",
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
annotation_colors = metaR_AnnColourx, annotation_col = meta.factors,
show_colnames = TRUE,
#color = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255),
color = colorRampPalette(c(colsHeat))(255),
border_color = "#f8edeb",
display_numbers = FALSE)
pheatmap3
dev.off()counts_norm <- reshape2::melt(counts(dds2, normalized = T), varnames = c('gene_id', 'sample_name'), value.name = 'counts')
counts_norm <- inner_join(x = counts_norm, y = as.data.frame(dds2@colData), by = 'sample_name')
png(paste0(path, '/results/ALL_Libraries/results/01_DE/distribution_L1_2_3.png'), width = 3000, height = 2500, res = 300)
distribution <- ggplot(data = counts_norm, aes(x = reorder(sample_name, bat, na.rm = T), y = log2(counts+1))) + geom_boxplot(aes(fill = factor(counts_norm$organ))) + coord_flip() + xlab('sample_name') + scale_color_manual(values = cols_organ) + theme_pubr(border = TRUE)
distribution
dev.off()dds3 <- DESeqDataSetFromMatrix(countData = round(counts),
colData = samples_info,
design = ~ 0 + organ_bat)
dds3 <- DESeq(dds3)
resultsNames(dds3)
#[1] "organ_batIntestine_Bat_1" "organ_batIntestine_Bat_2" "organ_batKidney_Bat_1"
#[4] "organ_batKidney_Bat_2" "organ_batSkeletal.muscle_Bat_1" "organ_batSkeletal.muscle_Bat_2"
log2FC_cutoff <- 1
padjust_cutoff <- 0.05
#Intestine Bat
contrast_name <- list(c('organ_batIntestine_Bat_1', 'organ_batIntestine_Bat_2'))
#res_results <- perform_DE(dds = dds3, contrast_name = contrast_name, log2FC_cutoff = log2FC_cutoff, padjust_cutoff = padjust_cutoff, file_name = intestine, volcano = T, genes = rownames(dds3))
res_obj <- results(dds3, tidy=TRUE, contrast = contrast_name, lfcThreshold = log2FC_cutoff, alpha = padjust_cutoff)
res <- res_obj
colnames(res)[1] <- "gene_id"
res <- res[!is.na(res$padj),]
res$diffexpressed <- 'No'
res$diffexpressed[(res$log2FoldChange >= log2FC_cutoff & res$padj <= 0.05)] <- 'Up-regulated'
res$diffexpressed[(res$log2FoldChange <= -log2FC_cutoff & res$padj <= 0.05)] <- 'Down-regulated'
res$delabel <- 'NA'
res[(res$diffexpressed != 'No'), 'delabel'] <- res[(res$diffexpressed != 'No'),'gene_name']
##
png(paste0(path, '/results/ALL_Libraries/results/01_DE/intestine_volcano_L1_2_3.png'), width = 3000, height = 2500, res = 300)
par(mar=c(5,5,5,5), cex=1.0, cex.main=1.4, cex.axis=1.4, cex.lab=1.4)
topT <- as.data.frame(res)
#Adjusted P values (FDR Q values)
with(topT, plot(log2FoldChange, -log10(padj), pch=20, main="Volcano plot", cex=1.0, xlab=bquote(~Log[2]~fold~change), ylab=bquote(~-log[10]~Q~value)))
with(subset(topT, padj<0.01 & abs(log2FoldChange)>2), points(log2FoldChange, -log10(padj), pch=20, col="red", cex=0.5))
#with(subset(topT, padj<0.05 & abs(log2FoldChange)>2), text(log2FoldChange, -log10(padj), labels=subset(rownames(topT), topT$padj<0.05 & abs(topT$log2FoldChange)>2), cex=0.8, pos=3))
#Add lines for absolute FC>2 and P-value cut-off at FDR Q<0.05
abline(v=0, col="black", lty=3, lwd=1.0)
abline(v=-2, col="black", lty=4, lwd=2.0)
abline(v=2, col="black", lty=4, lwd=2.0)
abline(h=-log10(max(topT$pvalue[topT$padj<0.01], na.rm=TRUE)), col="black", lty=4, lwd=2.0)
dev.off()We filter out low-count genes before downstream analysis by using at least 20 reads in total (including all samples) by which we removed 5932 genes and kept 23060 genes for downstream analysis.
The distribution of normalized expression in all samples are shown in figure 1.
There is a good correlation among samples as shown in figure 2. You can see that organs formed separate cluster.
Heatmap of Euclidean sample distances after rlog transformation
In figures ?? and ?? each point corresponds to a sample plotted on PC1 and PC2. I can see strong separation by organ . Library batch effect has been removed.
PCA all
These are the genes (top 100) most variable across all samples regardless of which samples they are.
TopVar genes results. Table below shows list of 100 genes
TopVar Clustered independent of the sample
TopVar 1000 Clustered independent of the sample